@misc{cobbe2021training,
  author        = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
  title         = {Training Verifiers to Solve Math Word Problems},
  year          = {2021},
  eprint        = {2110.14168},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG}
}


@misc{pang2022quality,
  title         = {{QuALITY}: Question Answering with Long Input Texts, Yes!},
  author        = {Pang, Richard Yuanzhe and Parrish, Alicia and Joshi, Nitish and Nangia, Nikita and Phang, Jason and Chen, Angelica and Padmakumar, Vishakh and Ma, Johnny and Thompson, Jana and He, He and Bowman, Samuel R.},
  year          = {2022},
  eprint        = {2112.08608},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL}
}

@inproceedings{tseng2016towards,
  title     = {Towards machine comprehension of spoken content: Initial {TOEFL} listening comprehension test by machine},
  author    = {Tseng, Bo-Hsiang and Shen, Sheng-Syun and Lee, Hung-Yi and Lee, Lin-Shan},
  booktitle = {INTERSPEECH},
  year      = {2016}
}

@inproceedings{chung2018supervised,
  author    = {Chung, Yu-An and Lee, Hung-Yi and Glass, James},
  title     = {Supervised and unsupervised transfer learning for question answering},
  booktitle = {NAACL HLT},
  year      = {2018}
}

@misc{hendrycks2021cuad,
  title         = {{CUAD}: An Expert-Annotated {NLP} Dataset for Legal Contract Review},
  author        = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer},
  year          = {2021},
  eprint        = {2103.06268},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL}
}


@inproceedings{Feng_2021,
  title     = {{MultiDoc2Dial}: Modeling Dialogues Grounded in Multiple Documents},
  author    = {Feng, Song and Patel, Siva Sankalp and Wan, Hui and Joshi, Sachindra},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  year      = {2021},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2021.emnlp-main.498}
}

@article{kwiatkowski-etal-2019-natural,
  title     = {{Natural Questions}: A Benchmark for Question Answering Research},
  author    = {Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and Toutanova, Kristina and Jones, Llion and Kelcey, Matthew and Chang, Ming-Wei and Dai, Andrew M. and Uszkoreit, Jakob and Le, Quoc and Petrov, Slav},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {7},
  year      = {2019},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://aclanthology.org/Q19-1026},
  doi       = {10.1162/tacl_a_00276},
  pages     = {452--466},
  abstract  = {We present the Natural Questions corpus, a question answering data set. Questions consist of real anonymized, aggregated queries issued to the Google search engine. An annotator is presented with a question along with a Wikipedia page from the top 5 search results, and annotates a long answer (typically a paragraph) and a short answer (one or more entities) if present on the page, or marks null if no long/short answer is present. The public release consists of 307,373 training examples with single annotations; 7,830 examples with 5-way annotations for development data; and a further 7,842 examples with 5-way annotated sequestered as test data. We present experiments validating quality of the data. We also describe analysis of 25-way annotations on 302 examples, giving insights into human variability on the annotation task. We introduce robust metrics for the purposes of evaluating question answering systems; demonstrate high human upper bounds on these metrics; and establish baseline results using competitive methods drawn from related literature.}
}


@misc{kočiský2017narrativeqa,
  title         = {The {NarrativeQA} Reading Comprehension Challenge},
  author        = {Kočiský, Tomáš and Schwarz, Jonathan and Blunsom, Phil and Dyer, Chris and Hermann, Karl Moritz and Melis, Gábor and Grefenstette, Edward},
  year          = {2017},
  eprint        = {1712.07040},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  internal-note = {NOTE(review): non-ASCII citation key works with Biber/biblatex but not classic BibTeX; key left unchanged so existing citations keep resolving --- confirm toolchain before renaming}
}

@misc{dasigi2021dataset,
  author        = {Dasigi, Pradeep and Lo, Kyle and Beltagy, Iz and Cohan, Arman and Smith, Noah A. and Gardner, Matt},
  title         = {A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers},
  year          = {2021},
  eprint        = {2105.03011},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL}
}

@inproceedings{huang-etal-2021-efficient,
  title     = {Efficient Attentions for Long Document Summarization},
  author    = {Huang, Luyang and Cao, Shuyang and Parulian, Nikolaus and Ji, Heng and Wang, Lu},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = jun,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.naacl-main.112},
  doi       = {10.18653/v1/2021.naacl-main.112},
  pages     = {1419--1436},
  abstract  = {The quadratic computational and memory complexities of large Transformers have limited their scalability for long document summarization. In this paper, we propose Hepos, a novel efficient encoder-decoder attention with head-wise positional strides to effectively pinpoint salient information from the source. We further conduct a systematic study of existing efficient self-attentions. Combined with Hepos, we are able to process ten times more tokens than existing models that use full attentions. For evaluation, we present a new dataset, GovReport, with significantly longer documents and summaries. Results show that our models produce significantly higher ROUGE scores than competitive comparisons, including new state-of-the-art results on PubMed. Human evaluation also shows that our models generate more informative summaries with fewer unfaithful errors.}
}


@misc{zhong2021qmsum,
  title         = {{QMSum}: A New Benchmark for Query-based Multi-domain Meeting Summarization},
  author        = {Zhong, Ming and Yin, Da and Yu, Tao and Zaidi, Ahmad and Mutuma, Mutethia and Jha, Rahul and Awadallah, Ahmed Hassan and Celikyilmaz, Asli and Liu, Yang and Qiu, Xipeng and Radev, Dragomir},
  year          = {2021},
  eprint        = {2104.05938},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL}
}

@misc{fabbri2019multinews,
  title         = {{Multi-News}: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},
  author        = {Fabbri, Alexander R. and Li, Irene and She, Tianwei and Li, Suyi and Radev, Dragomir R.},
  year          = {2019},
  eprint        = {1906.01749},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL}
}

@misc{yuan2021automate,
  author        = {Yuan, Weizhe and Liu, Pengfei and Neubig, Graham},
  title         = {Can We Automate Scientific Reviewing?},
  year          = {2021},
  eprint        = {2102.00176},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL}
}

@inproceedings{sharma-etal-2019-bigpatent,
  title     = {{BIGPATENT}: A Large-Scale Dataset for Abstractive and Coherent Summarization},
  author    = {Sharma, Eva and Li, Chen and Wang, Lu},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  month     = jul,
  year      = {2019},
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/P19-1212},
  doi       = {10.18653/v1/P19-1212},
  pages     = {2204--2213},
  abstract  = {Most existing text summarization datasets are compiled from the news domain, where summaries have a flattened discourse structure. In such datasets, summary-worthy content often appears in the beginning of input articles. Moreover, large segments from input articles are present verbatim in their respective summaries. These issues impede the learning and evaluation of systems that can understand an article{'}s global content structure as well as produce abstractive summaries with high compression ratio. In this work, we present a novel dataset, BIGPATENT, consisting of 1.3 million records of U.S. patent documents along with human written abstractive summaries. Compared to existing summarization datasets, BIGPATENT has the following properties: i) summaries contain a richer discourse structure with more recurring entities, ii) salient content is evenly distributed in the input, and iii) lesser and shorter extractive fragments are present in the summaries. Finally, we train and evaluate baselines and popular learning models on BIGPATENT to shed light on new challenges and motivate future directions for summarization research.}
}


@article{angelidis-etal-2021-extractive,
  title     = {Extractive Opinion Summarization in {Quantized Transformer} Spaces},
  author    = {Angelidis, Stefanos and Amplayo, Reinald Kim and Suhara, Yoshihiko and Wang, Xiaolan and Lapata, Mirella},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {9},
  year      = {2021},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://aclanthology.org/2021.tacl-1.17},
  doi       = {10.1162/tacl_a_00366},
  pages     = {277--293},
  abstract  = {We present the Quantized Transformer (QT), an unsupervised system for extractive opinion summarization. QT is inspired by Vector- Quantized Variational Autoencoders, which we repurpose for popularity-driven summarization. It uses a clustering interpretation of the quantized space and a novel extraction algorithm to discover popular opinions among hundreds of reviews, a significant step towards opinion summarization of practical scope. In addition, QT enables controllable summarization without further training, by utilizing properties of the quantized space to extract aspect-specific summaries. We also make publicly available Space, a large-scale evaluation benchmark for opinion summarizers, comprising general and aspect-specific summaries for 50 hotels. Experiments demonstrate the promise of our approach, which is validated by human studies where judges showed clear preference for our method over competitive baselines.}
}



@misc{chen2022summscreen,
  title         = {{SummScreen}: A Dataset for Abstractive Screenplay Summarization},
  author        = {Chen, Mingda and Chu, Zewei and Wiseman, Sam and Gimpel, Kevin},
  year          = {2022},
  eprint        = {2104.07091},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL}
}


